from pprint import pprint
import argparse
import os
import re
from datetime import datetime, timedelta
import polars as pl
import asyncio


def read_raw_data(file_path: str) -> tuple[pl.DataFrame, str]:
    """
    Read BL-420N raw data from .txt and return corresponding DataFrame and unit.

    Parameters
    ----------
    file_path : str
        Path to the file. Resulting frame format is as follows:

        ```
        ┌─────────────────────────┬───────┬──────────────┐
        │ timestamp               ┆ ch2   ┆ ch2 incident │
        │ ---                     ┆ ---   ┆ ---          │
        │ datetime[μs]            ┆ f64   ┆ str          │
        ╞═════════════════════════╪═══════╪══════════════╡
        │ …                       ┆ …     ┆ …            │
        └─────────────────────────┴───────┴──────────────┘
        ```

    Returns
    -------
    df: polars.DataFrame
        DataFrame with the data
    unit: str
        Unit of the data

    Raises
    ------
    ValueError
        If a header or data line does not match the expected BL-420N format.
    """
    with open(file_path, "r", encoding="gb2312", newline="\r\n") as reader:

        # Line 1 is a title we do not need; line 2 carries sample count/rate.
        reader.readline()
        line = reader.readline()

        pattern = r".+总共(?P<samples>\d+)个数据点.+：(?P<rate>\d+) Hz"

        if m := re.match(pattern, line):
            sample_rate = m.group("rate")
        else:
            raise ValueError(
                f"Target line {line} does not match regex pattern!")

        # Time step between consecutive samples.
        interval = timedelta(microseconds=1000000 / int(sample_rate))

        # Line 3: channel index and measurement unit.
        pattern = r"^(?P<channel>\d+)通道数据\((?P<unit>.+)\)\s+事件\s*$"

        line = reader.readline()

        if m := re.match(pattern, line):
            channel = m.group("channel")
            unit: str = m.group("unit")
        else:
            raise ValueError(
                f"Target line {line} does not match regex pattern!")

        print(f"{channel=}, {unit=}")

        # Timestamps are synthesized starting from today's midnight; the raw
        # file carries no absolute time information.
        t = datetime.today()
        t = t.replace(hour=0, minute=0, second=0, microsecond=0)
        print(t)

        # value, optional incident label, CRLF terminator
        line_re = re.compile(
            r"^(?P<value>-?\d+\.\d+)\s*(?P<incident>.*)\s*\r\n$")

        # Accumulate columns in plain lists and build the DataFrame once at
        # the end: calling DataFrame.extend() per row is quadratic overall.
        timestamps: list[datetime] = []
        values: list[float] = []
        incidents: list[str | None] = []

        while line := reader.readline():
            t = t + interval
            if m := line_re.match(line):
                value = m.group("value")
                # An empty incident group means "no incident" -> None.
                incident = m.group("incident") or None
            else:
                raise ValueError(
                    f"Target {line=} does not match regex pattern!")
            timestamps.append(t)
            values.append(float(value))
            incidents.append(incident)

        df = pl.DataFrame(
            data={
                "timestamp": timestamps,
                f"ch{channel}": values,
                f"ch{channel} incident": incidents,
            },
            schema={
                "timestamp": pl.Datetime,
                f"ch{channel}": pl.Float64,
                f"ch{channel} incident": pl.Utf8,
            },
        )

        # polars sort returns a new frame (it is not in-place); the original
        # code discarded the sorted result.
        df = df.sort("timestamp")

        return df, unit


async def async_read_raw_data(file_path: str, buf_line: int = 5) -> tuple[pl.DataFrame, str]:
    """
    Deprecated: the async reading pattern causes an unpredictable file reading bug.
    Read BL-420N raw data from .txt and return corresponding DataFrame and unit.
    This function is faster than read_raw_data,
    for async reading is used to avoid time spent on waiting
    and the per-chunk frames are built in batches.

    Warning: This API is unstable and will be changed in the future.

    Parameters
    -------
    file_path : str
        Path to the file. Format is as followed:

        ```
        ┌─────────────────────────┬───────┬──────────────┐
        │ timestamp               ┆ ch2   ┆ ch2 incident │
        │ ---                     ┆ ---   ┆ ---          │
        │ datetime[μs]            ┆ f64   ┆ str          │
        ╞═════════════════════════╪═══════╪══════════════╡
        │ …                       ┆ …     ┆ …            │
        └─────────────────────────┴───────┴──────────────┘
        ```
    buf_line : int
        Number of lines gathered per batch before a partial DataFrame is built.

    Returns
    -------
    df: polars.DataFrame
        DataFrame with the data
    unit: str
        Unit of the data
    """

    # Third-party async file I/O; imported lazily so the deprecated path does
    # not burden module import.
    import aiofiles

    def count_time(interval: timedelta):
        # Infinite generator of timestamps starting at today's midnight,
        # advancing by one sample interval per value yielded.
        t = datetime.today()
        t = t.replace(hour=0, minute=0, second=0, microsecond=0)
        while True:
            yield t
            t = t + interval

    async with aiofiles.open(file_path, "r", encoding="gb2312", newline="\r\n") as reader:

        # --- header line 2: sample count and sample rate ---

        await reader.readline()
        line = await reader.readline()

        # TODO: also capture which quantity was measured, e.g. pattern = r"(?P<project>\d+)，总共(?P<samples>\d+)个数据点.+：(?P<rate>\d+) Hz"
        pattern = r".+总共(?P<samples>\d+)个数据点.+：(?P<rate>\d+) Hz"

        if m := re.match(pattern, line):
            _ = m.group("samples")
            sample_rate = m.group("rate")
        else:
            raise Exception(
                f"Target line {line} does not match regex pattern!")

        # Time step between consecutive samples.
        interval = timedelta(microseconds=1000000 / int(sample_rate))

        # --- header line 3: channel index and measurement unit ---

        pattern = r"^(?P<channel>\d+)通道数据\((?P<unit>.+)\)\s+事件\s*$"

        line = await reader.readline()

        if m := re.match(pattern, line):
            channel = m.group("channel")
            unit: str = m.group("unit")
        else:
            raise Exception(
                f"Target line {line} does not match regex pattern!")

        print(f"{channel=}, {unit=}")

        # --- convert raw data to polars.DataFrame ---

        # initialise empty polars.DataFrame with the target schema
        df = pl.DataFrame(schema={
                          "timestamp": pl.Datetime, f"ch{channel}": pl.Float64, f"ch{channel} incident": pl.Utf8})

        t = datetime.today()
        t = t.replace(hour=0, minute=0, second=0, microsecond=0)
        print(t)
        time_gen = count_time(interval)

        # NOTE(review): this pattern is unused here — parsing is delegated to
        # _parse_line, which compiles the same pattern itself.
        pattern = r"^(?P<value>-?\d+\.\d+)\s*(?P<incident>.*)\s*\r\n$"

        # Read buf_line lines per batch and append each batch to the frame.
        while True:
            # NOTE(review): gathering concurrent readline() calls on one file
            # handle is the suspected source of the deprecated-bug above —
            # interleaved reads may not preserve file order. TODO confirm.
            lines = await asyncio.gather(*[reader.readline() for _ in range(buf_line)])
            # drop empty lines (EOF returns "" and blank rows "\n")
            lines_content = [_parse_line(line) for line in lines if (
                line != "") & (line != "\n")]
            if len(lines_content) == 0:
                break
            # generate a partial dataframe for this batch
            # WARNING: t should be adjusted.
            temp_df = pl.DataFrame(
                data={
                    "timestamp": [next(time_gen) for _ in range(len(lines_content))],
                    f"ch{channel}": [line[0] for line in lines_content],
                    f"ch{channel} incident": [line[1] for line in lines_content]
                }
            )
            df.extend(temp_df)
    return df, unit


def _parse_line(line: str) -> tuple[float, str | None]:
    """
    Returns
    -------
    value: float
        Value of the data
    incident: str|None
        Incident of the data
    """
    pattern = r"^(?P<value>-?\d+\.\d+)\s*(?P<incident>.*)\s*\r\n$"

    if m := re.match(pattern, line):
        value = m.group("value")
        incident = m.group("incident")
        if not incident:
            incident = None
    else:
        raise Exception(f"Target {line=} does not match regex pattern!")

    return float(value), incident
    pass


def native_read_raw_data(file_path: str) -> tuple[pl.DataFrame, str, str]:
    """
    Read BL-420N raw data from .txt and return corresponding DataFrame and unit.
    This function is faster than any of the others above,
    for native polars expressions are used to generate the dataframe.

    Parameters
    ----------
    file_path : str
        Path to the file.

    Returns
    -------
    df: polars.DataFrame
        DataFrame with the data. Format is as followed:

        ```
        ┌─────────────────────────┬───────┬──────────────┐
        │ timestamp               ┆ ch2   ┆ ch2 incident │
        │ ---                     ┆ ---   ┆ ---          │
        │ datetime[μs]            ┆ f64   ┆ str          │
        ╞═════════════════════════╪═══════╪══════════════╡
        │ 2025-01-01 00:00:00     ┆ 0.2   ┆ null         │
        │ 2025-01-01 00:00:00.010 ┆ 0.25  ┆ null         │
        │ …                       ┆ …     ┆ …            │
        └─────────────────────────┴───────┴──────────────┘
        ```

    unit: str
        Unit of the data

    project_name: str
        Project name of the data. e.g. 动脉血压, 中心静脉压, ...

    Raises
    ------
    ValueError
        If a header line does not match the expected BL-420N format.
    """

    with open(file_path, "r", encoding="gb2312", newline="\r\n") as reader:

        # Line 1 is a title; line 2 carries project name, sample count, rate.
        reader.readline()
        line = reader.readline()

        pattern = r"(?P<project>\D+)，总共(?P<samples>\d+)个数据点.+：(?P<rate>\d+) Hz"

        if m := re.match(pattern, line):
            project_name = m.group("project")
            sample_count = m.group("samples")
            sample_rate = m.group("rate")
        else:
            raise ValueError(
                f"Target line `{line}` does not match regex pattern!")

        print(f"{project_name=}, {sample_count} samples in total, {sample_rate=}")

        # Time step between consecutive samples; the timedelta constructor
        # rounds the float argument to whole microseconds.
        interval = timedelta(microseconds=1000000 / int(sample_rate))

        # Line 3: channel index and measurement unit.
        pattern = r"^(?P<channel>\d+)通道数据\((?P<unit>.+)\)\s+事件\s*$"

        line = reader.readline()

        if m := re.match(pattern, line):
            channel = m.group("channel")
            unit: str = m.group("unit")
        else:
            raise ValueError(
                f"Target line {line} does not match regex pattern!")

        print(f"{channel=}, {unit=}")

        # Timestamps are synthesized starting from today's midnight; the raw
        # file carries no absolute time information.
        t_start = datetime.today()
        t_start = t_start.replace(hour=0, minute=0, second=0, microsecond=0)
        print(f"time starts from {t_start}")

    # Let polars' native CSV reader parse the bulk of the file, skipping the
    # three header lines handled above.
    df_content = pl.read_csv(
        file_path,
        has_header=False,
        new_columns=["raw", "incident"],
        separator="\t",
        skip_rows=3,
        encoding="gb2312"
    )

    # Cast to Float64 to match the documented schema and the other readers
    # (the previous Float32 cast silently narrowed the data).
    df_content = df_content.with_columns(
        pl.col("raw").str.strip_chars().cast(pl.Float64).alias(f"ch{channel}"),
        pl.col("incident").str.strip_chars()
    ).with_row_index()

    # Whole microseconds per sample, as a plain int so the expression below
    # multiplies an integer column by an integer (expr * timedelta is not a
    # well-defined polars operation).
    step_us = int(interval / timedelta(microseconds=1))

    df = df_content.select(
        (pl.lit(t_start) + pl.duration(microseconds=pl.col("index") * step_us)
         ).alias("timestamp"),
        pl.col(f"ch{channel}"),
        pl.col("incident")
    )

    return df, unit, project_name


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input", "-i", help="Input BL-420N data", required=True)
    args = parser.parse_args()

    # Accept regular files and symlinks; resolve symlinks to the real path.
    if os.path.isfile(args.input) or os.path.islink(args.input):
        file_path = os.path.realpath(args.input)
    else:
        parser.error("--input doesn't get valid file path!")

    # Use the native polars reader — the fast, supported implementation
    # (read_raw_data and async_read_raw_data are kept for reference only).
    result = native_read_raw_data(file_path)

    pprint(result)
